library(tidyverse)
library(AmesHousing)
library(recipes)
library(caret)
library(rpart)
library(rpart.plot)
library(ranger)
library(xgboost)
library(AUC)
# Inspect caret's built-in model definition for "rpart" (CART).
# The second argument (FALSE) disables regex matching, so only the
# spec registered under the exact name "rpart" is returned.
getModelInfo("rpart", FALSE)$rpart
$label
[1] "CART"
$library
[1] "rpart"
$type
[1] "Regression" "Classification"
$parameters
$grid
# Builds the tuning grid for rpart's complexity parameter "cp".
# An unpruned tree (cp = 0) is fit once and the CP values are taken
# from its cptable, so the candidates correspond to points where the
# tree actually changes size.
function(x, y, len = NULL, search = "grid"){
dat <- if(is.data.frame(x)) x else as.data.frame(x)
dat$.outcome <- y
initialFit <- rpart::rpart(.outcome ~ .,
data = dat,
control = rpart::rpart.control(cp = 0))$cptable
# Sort by decreasing CP so row 1 holds the largest value.
initialFit <- initialFit[order(-initialFit[,"CP"]), , drop = FALSE]
if(search == "grid") {
if(nrow(initialFit) < len) {
# Fewer distinct CP values than requested: interpolate across the range.
tuneSeq <- data.frame(cp = seq(min(initialFit[, "CP"]),
max(initialFit[, "CP"]),
length = len))
} else tuneSeq <- data.frame(cp = initialFit[1:len,"CP"])
colnames(tuneSeq) <- "cp"
} else {
# Random search: sample (with replacement) from the observed CP values;
# unique() may therefore return fewer than `len` rows.
tuneSeq <- data.frame(cp = unique(sample(initialFit[, "CP"], size = len, replace = TRUE)))
}
tuneSeq
}
$loop
# caret "loop" hook: only the smallest cp (first row after ascending
# sort) is actually fit; the remaining cp values are returned as
# submodels, to be recovered later by pruning that single fit.
function(grid) {
grid <- grid[order(grid$cp, decreasing = FALSE),, drop = FALSE]
loop <- grid[1,,drop = FALSE]
submodels <- list(grid[-1,,drop = FALSE])
list(loop = loop, submodels = submodels)
}
$fit
# Fit hook. During resampling the requested cp is used directly; for
# the final fit ("last") the tree is grown with cp = 0 and then pruned
# back to the selected cp, so the full cptable is preserved. rpart's
# internal cross-validation (xval) is disabled because caret handles
# resampling itself.
function(x, y, wts, param, lev, last, classProbs, ...) {
cpValue <- if(!last) param$cp else 0
theDots <- list(...)
if(any(names(theDots) == "control"))
{
# A user-supplied rpart.control: override only cp and xval.
theDots$control$cp <- cpValue
theDots$control$xval <- 0
ctl <- theDots$control
theDots$control <- NULL
} else ctl <- rpart::rpart.control(cp = cpValue, xval = 0)
## check to see if case weights were passed in (and available)
if(!is.null(wts)) theDots$weights <- wts
modelArgs <- c(list(formula = as.formula(".outcome ~ ."),
data = if(is.data.frame(x)) x else as.data.frame(x),
control = ctl),
theDots)
modelArgs$data$.outcome <- y
out <- do.call(rpart::rpart, modelArgs)
if(last) out <- rpart::prune.rpart(out, cp = param$cp)
out
}
$predict
# Prediction hook. Classification returns class labels (type "class"),
# regression returns numeric predictions (type "vector"). Submodels
# (the other cp values from the loop hook) are produced by pruning the
# already-fitted tree -- no refitting -- and the result becomes a list
# with the main model's predictions in position 1.
function(modelFit, newdata, submodels = NULL) {
if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
pType <- if(modelFit$problemType == "Classification") "class" else "vector"
out <- predict(modelFit, newdata, type=pType)
if(!is.null(submodels))
{
tmp <- vector(mode = "list", length = nrow(submodels) + 1)
tmp[[1]] <- out
for(j in seq(along = submodels$cp))
{
prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
tmp[[j+1]] <- predict(prunedFit, newdata, type=pType)
}
out <- tmp
}
out
}
$prob
# Class-probability hook: like $predict but with type = "prob". For
# each pruned submodel the probability columns are reordered to match
# the training factor levels (modelFit$obsLevels) before being stored.
function(modelFit, newdata, submodels = NULL) {
if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
out <- predict(modelFit, newdata, type = "prob")
if(!is.null(submodels))
{
tmp <- vector(mode = "list", length = nrow(submodels) + 1)
tmp[[1]] <- out
for(j in seq(along = submodels$cp))
{
prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
tmpProb <- predict(prunedFit, newdata, type = "prob")
tmp[[j+1]] <- as.data.frame(tmpProb[, modelFit$obsLevels, drop = FALSE])
}
out <- tmp
}
out
}
$predictors
# Lists the predictors actually used by the tree: split variables from
# x$frame (excluding "<leaf>" rows) plus, when surrogate = TRUE, the
# surrogate-split variables whose adjusted agreement ("adj") is > 0.
function(x, surrogate = TRUE, ...) {
out <- as.character(x$frame$var)
out <- out[!(out %in% c("<leaf>"))]
if(surrogate)
{
splits <- x$splits
splits <- splits[splits[,"adj"] > 0,]
out <- c(out, rownames(splits))
}
unique(out)
}
$varImp
function(object, surrogates = FALSE, competes = TRUE, ...) {
  ## Variable importance for an rpart fit: sums the "improve" column of
  ## object$splits per variable. Primary splits always count; competing
  ## and surrogate splits are included/excluded via the arguments.
  ## Returns a one-column data frame ("Overall", rownames = predictor),
  ## padding predictors that never appear in a split with zero.
  if(nrow(object$splits) > 0) {
    tmp <- rownames(object$splits)
    rownames(object$splits) <- 1:nrow(object$splits)
    splits <- data.frame(object$splits)
    splits$var <- tmp
    splits$type <- ""
    frame <- as.data.frame(object$frame)
    index <- 0
    ## The splits table lists, for each non-leaf node of the frame, the
    ## primary split followed by its competing splits and then its
    ## surrogate splits; tag each row with its role.
    for(i in 1:nrow(frame)) {
      if(frame$var[i] != "<leaf>") {
        index <- index + 1
        splits$type[index] <- "primary"
        if(frame$ncompete[i] > 0) {
          for(j in 1:frame$ncompete[i]) {
            index <- index + 1
            splits$type[index] <- "competing"
          }
        }
        if(frame$nsurrogate[i] > 0) {
          for(j in 1:frame$nsurrogate[i]) {
            index <- index + 1
            splits$type[index] <- "surrogate"
          }
        }
      }
    }
    splits$var <- factor(as.character(splits$var))
    if(!surrogates) splits <- subset(splits, type != "surrogate")
    if(!competes) splits <- subset(splits, type != "competing")
    out <- aggregate(splits$improve,
                     list(Variable = splits$var),
                     sum,
                     na.rm = TRUE)
  } else {
    ## BUG FIX: this branch previously misspelled the column name as
    ## "Vaiable", so out$Variable was NULL and the rbind() below failed
    ## on mismatched column names whenever the tree had no splits.
    out <- data.frame(x = numeric(), Variable = character())
  }
  allVars <- colnames(attributes(object$terms)$factors)
  if(!all(allVars %in% out$Variable)) {
    ## Predictors never used in any split get an importance of zero.
    missingVars <- allVars[!(allVars %in% out$Variable)]
    zeros <- data.frame(x = rep(0, length(missingVars)),
                        Variable = missingVars)
    out <- rbind(out, zeros)
  }
  out2 <- data.frame(Overall = out$x)
  rownames(out2) <- out$Variable
  out2
}
$levels
# Returns the outcome factor levels that caret stored on the fit.
function(x) x$obsLevels
$trim
# Shrinks the fitted object for storage: drops the (potentially large)
# training-data copies (x, y) and node assignments (where), and
# reduces the stored call to just its na.action component.
function(x) {
x$call <- list(na.action = (x$call)$na.action)
x$x <- NULL
x$y <- NULL
x$where <- NULL
x
}
$tags
[1] "Tree-Based Model" "Implicit Feature Selection"
[3] "Handle Missing Predictor Data" "Accepts Case Weights"
$sort
# Orders tuning results by the first tuning column (cp), decreasing,
# so larger-cp (simpler) trees come first when caret breaks ties.
function(x) x[order(x[,1], decreasing = TRUE),]
# Train a single CART decision tree on the training partition, using a
# recipe interface. NOTE(review): `receita` (a recipes recipe),
# `train_control_rpart` and `grid_rpart` are defined earlier in the
# session and not visible here -- the metric "ROC" implies the control
# object uses twoClassSummary/classProbs; confirm.
modelo_rpart <- train(
receita,
credit_data %>% filter(base == "treino") %>% select(-base),
method = "rpart",
metric = "ROC",
trControl = train_control_rpart,
tuneGrid = grid_rpart
)
Preparing recipe
+ Fold1: cp=-0.001
- Fold1: cp=-0.001
+ Fold2: cp=-0.001
- Fold2: cp=-0.001
+ Fold3: cp=-0.001
- Fold3: cp=-0.001
+ Fold4: cp=-0.001
- Fold4: cp=-0.001
+ Fold5: cp=-0.001
- Fold5: cp=-0.001
Aggregating results
Selecting tuning parameters
Fitting cp = 2e-04 on full training set
# Render the final fitted tree to a wide PDF (20 x 10 inches), since
# the tree is too large to read on a default-sized device.
pdf("arvore.pdf", 20, 10)
rpart.plot(modelo_rpart$finalModel)
dev.off()
null device
1
# Evaluate the tree on the hold-out set; mode = "everything" adds
# precision/recall/F1 to the standard confusion-matrix statistics.
caret::confusionMatrix(
predict(modelo_rpart, credit_data_teste),
credit_data_teste$Status,
mode = "everything"
)
Confusion Matrix and Statistics
Reference
Prediction bad good
bad 191 145
good 195 853
Accuracy : 0.7543
95% CI : (0.7308, 0.7768)
No Information Rate : 0.7211
P-Value [Acc > NIR] : 0.002910
Kappa : 0.364
Mcnemar's Test P-Value : 0.007875
Sensitivity : 0.4948
Specificity : 0.8547
Pos Pred Value : 0.5685
Neg Pred Value : 0.8139
Precision : 0.5685
Recall : 0.4948
F1 : 0.5291
Prevalence : 0.2789
Detection Rate : 0.1380
Detection Prevalence : 0.2428
Balanced Accuracy : 0.6748
'Positive' Class : bad
# Show the grid-building function of the ranger model spec.
# NOTE(review): `infos` is never assigned in this transcript --
# presumably infos <- getModelInfo("ranger", FALSE)$ranger; verify.
infos$grid
# Tuning grid for ranger random forests over mtry, min.node.size and
# splitrule. Grid search: mtry values from caret::var_seq crossed with
# the default split rule (gini for classification, variance for
# regression) plus "extratrees"; min.node.size fixed at 1 / 5.
# Random search: the three parameters are sampled independently, and
# "maxstat" is only offered for regression outcomes.
function(x, y, len = NULL, search = "grid") {
if(search == "grid") {
srule <-
if (is.factor(y))
"gini"
else
"variance"
out <- expand.grid(mtry =
caret::var_seq(p = ncol(x),
classification = is.factor(y),
len = len),
min.node.size = ifelse( is.factor(y), 1, 5),
splitrule = c(srule, "extratrees"))
} else {
srules <- if (is.factor(y))
c("gini", "extratrees")
else
c("variance", "extratrees", "maxstat")
out <-
data.frame(
min.node.size= sample(1:(min(20,nrow(x))), size = len, replace = TRUE),
mtry = sample(1:ncol(x), size = len, replace = TRUE),
splitrule = sample(srules, size = len, replace = TRUE)
)
}
out
}
# Train a random forest via ranger on the training partition.
# importance = "permutation" is forwarded to ranger so variable
# importance can be extracted afterwards; tuneLength = 5 lets caret
# build the mtry/splitrule grid automatically (see infos$grid above).
# NOTE(review): `receita` and `train_control_rf` are defined elsewhere
# in the session.
modelo_rf <- train(
receita,
credit_data %>% filter(base %in% "treino") %>% select(-base),
method = "ranger", # FILL IN HERE (exercise placeholder)
importance = "permutation",
metric = "ROC",
trControl = train_control_rf,
tuneLength = 5
)
Preparing recipe
+ Fold1: mtry= 2, min.node.size=1, splitrule=gini
- Fold1: mtry= 2, min.node.size=1, splitrule=gini
+ Fold1: mtry= 6, min.node.size=1, splitrule=gini
- Fold1: mtry= 6, min.node.size=1, splitrule=gini
+ Fold1: mtry=10, min.node.size=1, splitrule=gini
- Fold1: mtry=10, min.node.size=1, splitrule=gini
+ Fold1: mtry=14, min.node.size=1, splitrule=gini
- Fold1: mtry=14, min.node.size=1, splitrule=gini
+ Fold1: mtry=18, min.node.size=1, splitrule=gini
- Fold1: mtry=18, min.node.size=1, splitrule=gini
+ Fold1: mtry= 2, min.node.size=1, splitrule=extratrees
- Fold1: mtry= 2, min.node.size=1, splitrule=extratrees
+ Fold1: mtry= 6, min.node.size=1, splitrule=extratrees
- Fold1: mtry= 6, min.node.size=1, splitrule=extratrees
+ Fold1: mtry=10, min.node.size=1, splitrule=extratrees
- Fold1: mtry=10, min.node.size=1, splitrule=extratrees
+ Fold1: mtry=14, min.node.size=1, splitrule=extratrees
- Fold1: mtry=14, min.node.size=1, splitrule=extratrees
+ Fold1: mtry=18, min.node.size=1, splitrule=extratrees
- Fold1: mtry=18, min.node.size=1, splitrule=extratrees
+ Fold2: mtry= 2, min.node.size=1, splitrule=gini
- Fold2: mtry= 2, min.node.size=1, splitrule=gini
+ Fold2: mtry= 6, min.node.size=1, splitrule=gini
- Fold2: mtry= 6, min.node.size=1, splitrule=gini
+ Fold2: mtry=10, min.node.size=1, splitrule=gini
- Fold2: mtry=10, min.node.size=1, splitrule=gini
+ Fold2: mtry=14, min.node.size=1, splitrule=gini
- Fold2: mtry=14, min.node.size=1, splitrule=gini
+ Fold2: mtry=18, min.node.size=1, splitrule=gini
- Fold2: mtry=18, min.node.size=1, splitrule=gini
+ Fold2: mtry= 2, min.node.size=1, splitrule=extratrees
- Fold2: mtry= 2, min.node.size=1, splitrule=extratrees
+ Fold2: mtry= 6, min.node.size=1, splitrule=extratrees
- Fold2: mtry= 6, min.node.size=1, splitrule=extratrees
+ Fold2: mtry=10, min.node.size=1, splitrule=extratrees
- Fold2: mtry=10, min.node.size=1, splitrule=extratrees
+ Fold2: mtry=14, min.node.size=1, splitrule=extratrees
- Fold2: mtry=14, min.node.size=1, splitrule=extratrees
+ Fold2: mtry=18, min.node.size=1, splitrule=extratrees
- Fold2: mtry=18, min.node.size=1, splitrule=extratrees
+ Fold3: mtry= 2, min.node.size=1, splitrule=gini
- Fold3: mtry= 2, min.node.size=1, splitrule=gini
+ Fold3: mtry= 6, min.node.size=1, splitrule=gini
- Fold3: mtry= 6, min.node.size=1, splitrule=gini
+ Fold3: mtry=10, min.node.size=1, splitrule=gini
- Fold3: mtry=10, min.node.size=1, splitrule=gini
+ Fold3: mtry=14, min.node.size=1, splitrule=gini
- Fold3: mtry=14, min.node.size=1, splitrule=gini
+ Fold3: mtry=18, min.node.size=1, splitrule=gini
- Fold3: mtry=18, min.node.size=1, splitrule=gini
+ Fold3: mtry= 2, min.node.size=1, splitrule=extratrees
- Fold3: mtry= 2, min.node.size=1, splitrule=extratrees
+ Fold3: mtry= 6, min.node.size=1, splitrule=extratrees
- Fold3: mtry= 6, min.node.size=1, splitrule=extratrees
+ Fold3: mtry=10, min.node.size=1, splitrule=extratrees
- Fold3: mtry=10, min.node.size=1, splitrule=extratrees
+ Fold3: mtry=14, min.node.size=1, splitrule=extratrees
- Fold3: mtry=14, min.node.size=1, splitrule=extratrees
+ Fold3: mtry=18, min.node.size=1, splitrule=extratrees
- Fold3: mtry=18, min.node.size=1, splitrule=extratrees
+ Fold4: mtry= 2, min.node.size=1, splitrule=gini
- Fold4: mtry= 2, min.node.size=1, splitrule=gini
+ Fold4: mtry= 6, min.node.size=1, splitrule=gini
- Fold4: mtry= 6, min.node.size=1, splitrule=gini
+ Fold4: mtry=10, min.node.size=1, splitrule=gini
- Fold4: mtry=10, min.node.size=1, splitrule=gini
+ Fold4: mtry=14, min.node.size=1, splitrule=gini
- Fold4: mtry=14, min.node.size=1, splitrule=gini
+ Fold4: mtry=18, min.node.size=1, splitrule=gini
- Fold4: mtry=18, min.node.size=1, splitrule=gini
+ Fold4: mtry= 2, min.node.size=1, splitrule=extratrees
- Fold4: mtry= 2, min.node.size=1, splitrule=extratrees
+ Fold4: mtry= 6, min.node.size=1, splitrule=extratrees
- Fold4: mtry= 6, min.node.size=1, splitrule=extratrees
+ Fold4: mtry=10, min.node.size=1, splitrule=extratrees
- Fold4: mtry=10, min.node.size=1, splitrule=extratrees
+ Fold4: mtry=14, min.node.size=1, splitrule=extratrees
- Fold4: mtry=14, min.node.size=1, splitrule=extratrees
+ Fold4: mtry=18, min.node.size=1, splitrule=extratrees
- Fold4: mtry=18, min.node.size=1, splitrule=extratrees
+ Fold5: mtry= 2, min.node.size=1, splitrule=gini
- Fold5: mtry= 2, min.node.size=1, splitrule=gini
+ Fold5: mtry= 6, min.node.size=1, splitrule=gini
- Fold5: mtry= 6, min.node.size=1, splitrule=gini
+ Fold5: mtry=10, min.node.size=1, splitrule=gini
- Fold5: mtry=10, min.node.size=1, splitrule=gini
+ Fold5: mtry=14, min.node.size=1, splitrule=gini
- Fold5: mtry=14, min.node.size=1, splitrule=gini
+ Fold5: mtry=18, min.node.size=1, splitrule=gini
- Fold5: mtry=18, min.node.size=1, splitrule=gini
+ Fold5: mtry= 2, min.node.size=1, splitrule=extratrees
- Fold5: mtry= 2, min.node.size=1, splitrule=extratrees
+ Fold5: mtry= 6, min.node.size=1, splitrule=extratrees
- Fold5: mtry= 6, min.node.size=1, splitrule=extratrees
+ Fold5: mtry=10, min.node.size=1, splitrule=extratrees
- Fold5: mtry=10, min.node.size=1, splitrule=extratrees
+ Fold5: mtry=14, min.node.size=1, splitrule=extratrees
- Fold5: mtry=14, min.node.size=1, splitrule=extratrees
+ Fold5: mtry=18, min.node.size=1, splitrule=extratrees
- Fold5: mtry=18, min.node.size=1, splitrule=extratrees
Aggregating results
Selecting tuning parameters
Fitting mtry = 2, splitrule = gini, min.node.size = 1 on full training set
## Hold-out evaluation of the random forest; mode = "everything" adds
## precision, recall and F1 to the usual confusion-matrix statistics.
## (Reformatted across lines to match the rpart evaluation call above.)
caret::confusionMatrix(
  predict(modelo_rf, credit_data_teste),
  credit_data_teste$Status,
  mode = "everything"
)
Confusion Matrix and Statistics
Reference
Prediction bad good
bad 146 41
good 240 957
Accuracy : 0.797
95% CI : (0.7748, 0.8179)
No Information Rate : 0.7211
P-Value [Acc > NIR] : 5.126e-11
Kappa : 0.4005
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.3782
Specificity : 0.9589
Pos Pred Value : 0.7807
Neg Pred Value : 0.7995
Precision : 0.7807
Recall : 0.3782
F1 : 0.5096
Prevalence : 0.2789
Detection Rate : 0.1055
Detection Prevalence : 0.1351
Balanced Accuracy : 0.6686
'Positive' Class : bad
Exercício: Ajuste um xgboost usando o caret e responda: qual modelo apresenta a maior AUC? Ctrl+C Ctrl+V por sua conta!
DICA 1) troque "ranger" por "xgbTree". DICA 2) rode info <- getModelInfo("xgbTree", FALSE)$xgbTree e depois consulte info$parameters. DICA 3) experimente usar o parâmetro tuneLength = 20 em vez do `tuneGrid`.